library(scran)
library(scater)
library(pheatmap)
library(tidyverse)
# BiocManager::install("scRNAseq")
figure.path <- "C:/Users/jessb/OneDrive/MS-CB/Functional Interpretation of High-Throughput Data/Assignments/Assignment 4"

## Load pancreas data from Baron etal. (2016)
## C:\Users\jessb\AppData\Local\ExperimentHub\ExperimentHub\Cache
baron.sce <- scRNAseq::BaronPancreasData(which = 'human')

## Pancreas cells gene markers. Taken from Table S2, from Baron etal,  Cell Syst. 2016 Oct 26;3(4):346-360.e4.
cell.markers <- tibble(cell=c("Alpha", "Beta", "Delta", "Gamma","Epsilon", "Ductal", 
                              "Acinar", "Stellate","Vascular", "Macrophage", "CytotoxicT",
                              "Mast"), 
                              gene=c("GCG", "INS", "SST", "PPY", "GHRL", "KRT19",
                              "CPA1" ,"PDGFRB", "VWF", "CD163", "CD3D", "TPSAB1"))

## Additional pancreas from Grun etal. (2016)
# grun.sce <- scRNAseq::GrunPancreasData()
Annotate <- function(sce.obj)
{
  
  ## use anyone of these packages to annotate the genes for 
  ## ids, genomic location and description
  # library(Organism.dplyr)
  # library(EnsDb.Hsapiens.v86)
  
  library(AnnotationHub)
  
  ens.GRCh38 <- AnnotationHub()[["AH73881"]]
  
  genenames <- rownames(sce.obj)
  
  geneids <- mapIds(ens.GRCh38,
                    keys = genenames,
                    keytype = "GENENAME",
                    column = "GENEID")

  locations <- mapIds(ens.GRCh38,
                      keys = genenames,
                      keytype = "GENENAME",
                      column = "SEQNAME")

  descriptions <- mapIds(ens.GRCh38,
                         keys = genenames,
                         keytype = "GENENAME",
                         column = "DESCRIPTION")

  ##Add gene annotation
  rowData(sce.obj)$geneids <- geneids
  rowData(sce.obj)$locations <- locations
  rowData(sce.obj)$descriptions <- descriptions
  
  return(sce.obj)
}

## Annotate
baron.sce <- Annotate(baron.sce)
## Warning: Unable to map 1280 of 20125 requested IDs.

## Warning: Unable to map 1280 of 20125 requested IDs.

## Warning: Unable to map 1280 of 20125 requested IDs.
FilterNQC <- function(sce.obj, plot.path=NULL, max_mito_frac=0.15)
{
  library(ggplot2)
  
  # max_mito_frac <- 0.15
  ## find mitochondrial genes
  is.mito <- which(rowData(sce.obj)$locations=="MT")

  ## compute cell QC
  qc.cell <- perCellQCMetrics(sce.obj, subsets=list(Mito=is.mito))
  qc.cell.df <- cbind(as.numeric(qc.cell$sum),
                      as.numeric(qc.cell$detected),
                      as.numeric(qc.cell$subsets_Mito_percent),
                      colData(sce.obj)$donor)
  qc.cell.df <- as.data.frame(qc.cell.df, row.names = colnames(sce.obj))
  colnames(qc.cell.df) = c('sum', 'detected', 'mito_percent', 'donor')

  ## compute gene QC
  # qc.gene <- apply(assay(sce.obj), 2, function(x) length(which(x>0))) 
  # qc.cell.df <- cbind(qc.cell.df, qc.gene)
  # colnames(qc.cell.df) = c('sum', 'detected', 'mito_percent', 'donor', 'genes')
  
  ## violin plot
  violin_cell <- ggplot(qc.cell.df, aes(x=donor, y=mito_percent)) + 
    geom_violin(trim=FALSE) + 
    labs(y = "% of UMIs mitochondrial", x="Donors")
  ggsave("violin_cell.png", plot = violin_cell, path = plot.path)

  violin_gene <- ggplot(qc.cell.df, aes(x=donor, y=detected)) + 
    geom_violin(trim=FALSE) + 
    labs(y = "Number of genes expressed", x="Donors")
  ggsave("violin_gene.png", plot = violin_gene, path = plot.path)

  violin_umi <- ggplot(qc.cell.df, aes(x=donor, y=sum)) +
    geom_violin(trim=FALSE) +
    labs(y = "Number of genes expressed", x="Donors")
  ggsave("violin_umi.png", plot = violin_umi, path = plot.path)

  ## use `quickPerCellQC` function for filtering cells
  ## Add diagnostic plot
  discarded <- quickPerCellQC(qc.cell, 
                              percent_subsets=c("subsets_Mito_percent",
                                                         "detected"))
  colData(sce.obj) <- cbind(colData(sce.obj), qc.cell, discarded)
  
  ## filter by low number of detected genes & plot
  print(plotColData(sce.obj, x="donor", y="detected",
                    colour_by="low_lib_size"))
  print(plotColData(sce.obj, x="donor", y="detected",
                    colour_by="low_n_features"))
  print(plotColData(sce.obj, x="donor", y="detected",
                    colour_by="high_detected"))
  print(plotColData(sce.obj, x="donor", y="subsets_Mito_percent",
                    colour_by="high_subsets_Mito_percent"))

  ## remove cells with high mt
  sce.obj <- sce.obj[, !discarded$high_subsets_Mito_percent]

  return(sce.obj)
}

##Filter
baron.sce <- FilterNQC(baron.sce, plot.path = figure.path)

Normalize <- function(sce.obj, sctransform=FALSE, plot.path=NULL)
{
  ## normalize by deconvolution 
  set.seed(100)
  clust.sce <- quickCluster(sce.obj)
  deconv.sf.sce <- calculateSumFactors(sce.obj, cluster=clust.sce)
  colData(sce.obj) <- cbind(colData(sce.obj), clust.sce, deconv.sf.sce)
  
  ## plot comparison between size factors
  # png(file=paste0(plot.path, "/size_factors.png"))
  # plot(clust.sce, deconv.sf.sce)
  # dev.off()
  col_data <- as.data.frame(colData(sce.obj))
  size_factor <- ggplot(col_data, aes(x = clust.sce, y = deconv.sf.sce)) +
    geom_boxplot() +
    labs(x = "Clusters", y = "Deconvolution size factor")
    
  ggsave("size_factor.png", plot = size_factor, path = plot.path)
  
  ## transform to log normal counts using deconvolution size factor
  sce.obj <- logNormCounts(sce.obj)
  
  ## challenge Q: alternative use of sctransform
  
  return(sce.obj)
}

## Normalize
baron.sce <- Normalize(baron.sce, sctransform=FALSE, plot.path= figure.path)
FeatureSelection <-function(sce.obj, plot.path=NULL)
{
 
  ## model gene variance
  dec <- modelGeneVar(sce.obj)
  # dec.df <- as.data.frame(dec)

  ## plot mean- variadec.df
  plot(dec$mean, dec$total, xlab="Mean log-expression", ylab="Variance")
  print(curve(metadata(dec)$trend(x), col="blue", add=TRUE))

  ## select HVG either by number or FDR threshold
  hvg <- getTopHVGs(dec, fdr.threshold=0.05)

  return(hvg)
}

## Feature selection
baron.hvg <- FeatureSelection(baron.sce, plot.path = figure.path)

## $x
##   [1] -5.551115e-17  5.382593e-02  1.076519e-01  1.614778e-01  2.153037e-01
##   [6]  2.691296e-01  3.229556e-01  3.767815e-01  4.306074e-01  4.844333e-01
##  [11]  5.382593e-01  5.920852e-01  6.459111e-01  6.997371e-01  7.535630e-01
##  [16]  8.073889e-01  8.612148e-01  9.150408e-01  9.688667e-01  1.022693e+00
##  [21]  1.076519e+00  1.130344e+00  1.184170e+00  1.237996e+00  1.291822e+00
##  [26]  1.345648e+00  1.399474e+00  1.453300e+00  1.507126e+00  1.560952e+00
##  [31]  1.614778e+00  1.668604e+00  1.722430e+00  1.776256e+00  1.830082e+00
##  [36]  1.883907e+00  1.937733e+00  1.991559e+00  2.045385e+00  2.099211e+00
##  [41]  2.153037e+00  2.206863e+00  2.260689e+00  2.314515e+00  2.368341e+00
##  [46]  2.422167e+00  2.475993e+00  2.529819e+00  2.583645e+00  2.637470e+00
##  [51]  2.691296e+00  2.745122e+00  2.798948e+00  2.852774e+00  2.906600e+00
##  [56]  2.960426e+00  3.014252e+00  3.068078e+00  3.121904e+00  3.175730e+00
##  [61]  3.229556e+00  3.283382e+00  3.337208e+00  3.391033e+00  3.444859e+00
##  [66]  3.498685e+00  3.552511e+00  3.606337e+00  3.660163e+00  3.713989e+00
##  [71]  3.767815e+00  3.821641e+00  3.875467e+00  3.929293e+00  3.983119e+00
##  [76]  4.036945e+00  4.090771e+00  4.144596e+00  4.198422e+00  4.252248e+00
##  [81]  4.306074e+00  4.359900e+00  4.413726e+00  4.467552e+00  4.521378e+00
##  [86]  4.575204e+00  4.629030e+00  4.682856e+00  4.736682e+00  4.790508e+00
##  [91]  4.844333e+00  4.898159e+00  4.951985e+00  5.005811e+00  5.059637e+00
##  [96]  5.113463e+00  5.167289e+00  5.221115e+00  5.274941e+00  5.328767e+00
## [101]  5.382593e+00
## 
## $y
##   [1]       NaN 0.0653179 0.1300124 0.1926598 0.2534547 0.3122463 0.3689155
##   [8] 0.4233695 0.4755390 0.5253746 0.5728460 0.6179450 0.6606961 0.7012090
##  [15] 0.7397540 0.7762576 0.8108621 0.8439199 0.8753240 0.9047061 0.9316127
##  [22] 0.9560617 0.9784777 0.9992314 1.0187099 1.0370315 1.0542348 1.0702803
##  [29] 1.0852924 1.0991929 1.1112657 1.1209015 1.1277813 1.1323933 1.1354633
##  [36] 1.1365731 1.1363638 1.1350638 1.1336649 1.1317839 1.1273946 1.1188406
##  [43] 1.1061021 1.0921572 1.0724377 1.0525400 1.0313591 1.0100198 0.9885689
##  [50] 0.9675635 0.9475716 0.9273206 0.9067664 0.8861958 0.8659772 0.8470074
##  [57] 0.8284294 0.8110407 0.7938844 0.7782651 0.7635266 0.7498072 0.7365415
##  [64] 0.7241412 0.7132220 0.7033238 0.6964545 0.6888344 0.6817823 0.6747141
##  [71] 0.6664680 0.6585203 0.6505743 0.6426918 0.6367655 0.6321084 0.6274923
##  [78] 0.6234382 0.6196217 0.6165142 0.6133865 0.6102411 0.6071731 0.6047017
##  [85] 0.6022128 0.5997082 0.5970636 0.5941421 0.5912137 0.5882797 0.5853413
##  [92] 0.5824004 0.5794584 0.5765163 0.5735755 0.5706368 0.5677014 0.5647701
##  [99] 0.5618439 0.5589236 0.5560100
Cluster <- function(sce.obj, hvg.obj, plot.path=NULL)
{
  ## remeber to set.seed for reproducible results
  ## moved up as scater manual suggested set seed for runPCA
  set.seed(100)
  
  ## run PCA
  sce.obj <- runPCA(sce.obj, subset_row=hvg.obj)
  attr(reducedDim(sce.obj), "percentVar") <- attr(reducedDim(sce.obj),
                                                  "percentVar")/100

  ## select meaningful number of dimensions and plot
  y <- attr(reducedDim(sce.obj), "percentVar")
  x <- seq(1, length(y))
  print(plot(x, y, main = "Variance Explained by Principal Components",
             xlab = "PC", ylab = "Percent Variance Explained"))
  
  ## intended to collect PCs that explained 80% of var, but >50
  # for (n in x){
  #  if (sum(y[1:n]) < 0.8){
  #    next
  #  } else {
  #    optimal_n <- n
  #    print(cat("The number of principal components that explain 
  #              80% of the variation are", optimal_))
  #    break
  #  }
  # }

  ## reduce dim by PCA and plot
  print(plotPCA(sce.obj, ncomponents = 7, colour_by = "label"))

  ## run TSNE and plot
  ## ran on pre-existing PCA results to speed via low rank approximation
  ## selected number of PCs to include using elbow method
  sce.obj <- runTSNE(sce.obj, perplexity=50, dimred="PCA", n_dimred=7)
  print(plotReducedDim(sce.obj, dimred = "TSNE", colour_by = "label"))

  ## run UMAP and plot
  library(uwot)
  sce.obj <- runUMAP(sce.obj)
  print(plotReducedDim(sce.obj, dimred = "UMAP", colour_by = "label"))

  ## build shared nearest-neighbor graph and plot
  g <- buildSNNGraph(sce.obj, use.dimred="PCA")
  clusters <- igraph::cluster_louvain(g)$membership
  sce.obj$clusters <- factor(clusters)
  print(table(sce.obj$clusters))
  print(plotReducedDim(sce.obj, dimred = "TSNE", 
                       colour_by = "clusters", text_by = "clusters"))

  ## cluster modularity and heatmap
  ratio <- clusterModularity(g, clusters, as.ratio=TRUE)
  print(pheatmap(log10(ratio+1), cluster_cols=FALSE, cluster_rows=FALSE,
                 col=rev(heat.colors(100))))

  ## graph of clusters and plot
  ## not sure what's supposed to be here - thought I graphed SNN clusters above

  return(sce.obj)
}

## cluster
baron.sce <- Cluster(baron.sce, baron.hvg, plot.path = figure.path)

## NULL

## Warning: package 'uwot' was built under R version 3.6.3
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## The following object is masked from 'package:S4Vectors':
## 
##     expand

## 
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
##  236  933  143  886  627  826  761   86 1133  263  459  809  237  952   11  207

MarkerGenes <- function(sce.obj, plot.path=NULL)
{
  ## Find stringent markers. Only genes that are unique to each cluster 
  ## are identified. e.g. Insulin will be missed
  markers_unique <- findMarkers(sce.obj, sce.obj$clusters, pval.type="all")
  
  ## find any markers
  markers_any <- findMarkers(sce.obj, sce.obj$clusters, pval.type="any")
  
  ## plot insulin marker
  INS_fold <- 1
  for (n in seq(3, length(markers_any)+3-1)){
    INS_fold <- c(INS_fold, markers_any[[1]][grep("^INS$",
                                                  rownames(markers_any[[1]])), n])
  }
  
  INS <- numeric(length=length(sce.obj$clusters))
  for (n in seq(1, length(sce.obj$clusters))){
    INS[n] <- INS_fold[sce.obj$clusters[n]]
  }
  sce.obj$INS <- INS
  
  print(plotReducedDim(sce.obj, dimred = "TSNE",
                       colour_by = "INS", text_by = "clusters"))
  
  ## find and plot markers for a specific cluster
  test <- as.data.frame(markers_unique[[1]][2], row.names = rownames(markers_unique))
  clust1_heatmap <- as.matrix(markers_unique[[1]]
                              [which(test<0.05), 2:length(markers_unique)+1])
  
  print(pheatmap(clust1_heatmap, main = "Cluster 1 logFC for DE Markers"))
  
  return(markers_unique)
}

## Marker genes
baron.markers <- MarkerGenes(baron.sce, plot.path = figure.path)

AnnotateClusters <- function(sce.obj, type.markers, plot.path=NULL)
{
  library(AUCell)
  cells_rankings <- AUCell_buildRankings(assay(sce.obj))
  
  library(GSEABase)
  for (n in seq(1, dim(type.markers)[1])){
    assign(paste0("gs", n), 
           GeneSet(type.markers$gene[n], setName=type.markers$cell[n]))
  }
  
  ## didn't have time to generalize this 
  # names_gs <- paste("gs", c(1:dim(type.markers)[1]), sep = "")
  gsc <- GeneSetCollection(gs1, gs2, gs3, gs4, gs5, gs6,
                           gs7, gs8, gs9, gs10, gs11, gs12)

  ## Annotate the clusters using type.markers
  cells_AUC <- AUCell_calcAUC(gsc, cells_rankings,
                              aucMaxRank=nrow(cells_rankings)*0.05)
  
  # par(mfrow=c(4,3))
  cells_assignment <- AUCell_exploreThresholds(cells_AUC, plotHist=TRUE, 
                                               nCores=1, assign=TRUE)
  
  selectedThresholds <- getThresholdSelected(cells_assignment)
  # par(mfrow=c(4,3))
  for (geneSetName in names(selectedThresholds)){
    nBreaks <- 5 # Number of levels in the color palettes
    ## Color palette for the cells that do not pass the threshold
    colorPal_Neg <- grDevices::colorRampPalette(c("black","blue", "skyblue"))(nBreaks)
    ## Color palette for the cells that pass the threshold
    colorPal_Pos <- grDevices::colorRampPalette(c("pink", "magenta", "red"))(nBreaks)
    
    ## Split cells according to their AUC value for the gene set
    passThreshold <- getAUC(cells_AUC)[geneSetName,] >  selectedThresholds[geneSetName]
    if(sum(passThreshold) >0 ){
      aucSplit <- split(getAUC(cells_AUC)[geneSetName,], passThreshold)
      ## Assign cell color
      cellColor <- c(setNames(colorPal_Neg[cut(aucSplit[[1]], breaks=nBreaks)],
                              names(aucSplit[[1]])), setNames(colorPal_Pos[cut(aucSplit[[2]],
                                                                              breaks=nBreaks)],
                                                              names(aucSplit[[2]])))
      
      plot(reducedDim(sce.obj, "TSNE"), main=geneSetName,
           sub="Pink/red cells pass the threshold", xlab="", ylab="",
           col=cellColor[rownames(reducedDim(sce.obj, "TSNE"))], pch=16) 
    }
  }
}

## Annotate clusters
AnnotateClusters(baron.sce, cell.markers, plot.path = figure.path)

##     min      1%      5%     10%     50%    100% 
##  767.00  845.36  954.00 1086.00 1847.00 4922.00

## save SCE object
# saveRDS(baron.sce, file = paste0("../data/BaronHumanSCE_", Sys.Date(), ".Rds"))

Resources

https://bioconductor.org/packages/release/bioc/vignettes/scater/inst/doc/overview.html

https://osca.bioconductor.org/clustering.html

https://bioconductor.org/packages/devel/bioc/vignettes/scran/inst/doc/scran.html#6_graph-based_clustering

https://www.bioconductor.org/packages/release/bioc/vignettes/AUCell/inst/doc/AUCell.html